{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9a2fc3e2-9a78-4d18-b809-391b7d9a9430",
   "metadata": {},
   "source": [
    "### PhyloP test code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f3cd2ccf-692a-4ddb-918c-a4a6ac8c5e0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path \n",
    "import pyBigWig\n",
    "import numpy as np\n",
    "import pandas as pd "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1517a9a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "phylop_scores_bw = wkdir_path.joinpath(\"reference/conservation/phylop/hg38.phyloP100way.bw\")\n",
    "sv_bed_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed\")\n",
    "chrom=\"chr21\"\n",
    "out_dir = wkdir_path.joinpath(\"5_misexp_vrnts/scores/phylop\")\n",
    "out_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "fa833161",
   "metadata": {},
   "outputs": [],
   "source": [
    "bed_columns={0:\"chrom\", 1:\"start\", 2:\"end\", 3:\"vrnt_id\"}\n",
    "sv_bed_df = pd.read_csv(sv_bed_path, sep=\"\\t\", header=None).rename(columns=bed_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3e1743f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/nfs/users/nfs_t/tv5/.conda/envs/tv5_base/lib/python3.7/site-packages/ipykernel_launcher.py:15: RuntimeWarning: All-NaN axis encountered\n",
      "  from ipykernel import kernelapp as app\n",
      "/nfs/users/nfs_t/tv5/.conda/envs/tv5_base/lib/python3.7/site-packages/ipykernel_launcher.py:16: RuntimeWarning: All-NaN axis encountered\n",
      "  \n",
      "/nfs/users/nfs_t/tv5/.conda/envs/tv5_base/lib/python3.7/site-packages/ipykernel_launcher.py:17: RuntimeWarning: Mean of empty slice\n",
      "  app.launch_new_instance()\n",
      "/nfs/users/nfs_t/tv5/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1120: RuntimeWarning: All-NaN slice encountered\n",
      "  overwrite_input=overwrite_input)\n"
     ]
    }
   ],
   "source": [
    "phylop_bw = pyBigWig.open(str(phylop_scores_bw))\n",
    "phylop_consv_vrnt_results = {}\n",
    "for index, row in sv_bed_df.iterrows():\n",
    "    chrom = row['chrom']\n",
    "    start = row['start']\n",
    "    end = row['end']\n",
    "    vrnt_id = row[\"vrnt_id\"]\n",
    "    \n",
    "    # query the BigWig file for the given interval\n",
    "    values = phylop_bw.values(chrom, start, end)\n",
    "    if len(values) != end - start: \n",
    "        print(gene_id)\n",
    "    \n",
    "    # ignores NaNs\n",
    "    min_value = np.nanmin(values) if values is not None else None\n",
    "    max_value = np.nanmax(values) if values is not None else None\n",
    "    mean_value = np.nanmean(values) if values is not None else None\n",
    "    median_value = np.nanmedian(values) if values is not None else None\n",
    "    \n",
    "    # add metrics to dictionary \n",
    "    phylop_consv_vrnt_results[index] = [chrom, start, end, vrnt_id, min_value, max_value, mean_value]\n",
    "# close the BigWig file\n",
    "phylop_bw.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61b0a89a",
   "metadata": {},
   "outputs": [],
   "source": [
    "pylop_results_columns=[\"chrom\", \"start\", \"end\", \"vrnt_id\", \"phylop_min\", \"phylop_max\", \"phylop_mean\"]\n",
    "phylop_consv_vrnt_df = pd.DataFrame.from_dict(phylop_consv_vrnt_results, orient=\"index\", columns=pylop_results_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75c54072",
   "metadata": {},
   "outputs": [],
   "source": [
    "phylop_consv_vrnt_path = out_dir.joinpath(\"inactive_gene_phylop_gene_body.tsv\")\n",
    "phylop_consv_vrnt_df.to_csv(phylop_consv_vrnt_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2249c83c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
